There are two sequencing runs, each with the two fastq files from each of the 280 true samples and the 94 mock ones. In all they are 1496 fastq files. To visualize quality parameters of all them is too much.
library(fastqcr)
run1 <- '../../data/fastq'
run2 <- '../../data/fastr'
if (! file.exists('report1')) {
fastqc(fq.dir = run1, qc.dir = "reports1",
threads = 40, fastqc.path = "/usr/local/bin/fastqc")
}
if (! file.exists('report2')) {
fastqc(fq.dir = run2, qc.dir = "reports2",
threads = 40, fastqc.path = "/usr/local/bin/fastqc")
}
qc.files1 <- list.files(path = "reports1", all.files = FALSE,
full.names = TRUE, pattern = "zip")
# Sample names can be extracted from the file names
samples1 <- sapply(strsplit(qc.files1, "[/_]"), '[[', 2)
qc.files2 <- list.files(path = 'reports2', all.files = FALSE,
full.names = TRUE, pattern = 'zip')
# Some fake samples (HOP) got so little spurious reads that appear in
# only one of the two sequencing runs. I will limit to the subset of
# samples with fastq files from both sequencing runs.
samples2 <- sapply(strsplit(qc.files2, "[/_]"), '[[', 2)
samples <- intersect(samples1, samples2)
rm(samples1, samples2)
# The lists of fastq files from the two sequencing runs are not
# "parallel". I will use the common samples to filter the lists of files:
qc.files1 <- grep(paste0('(', paste(samples, collapse = '|'), ')'), qc.files1, value = TRUE)
qc.files2 <- grep(paste0('(', paste(samples, collapse = '|'), ')'), qc.files2, value = TRUE)
stopifnot(all.equal(substring(qc.files1, 10), substring(qc.files2, 10)))
To read collections of reports is slow, and we don't want to include all samples in a collection, because then the plots would not be visible. One way to split samples in meaningful groups would be to use the 3-letters prefix of their names, which indicate the lake of origin, I think.
table(substr(samples, 1, 3))
##
## 175 BIE BRZ HOP LAN SUO THU WAL ZUR
## 22 14 41 89 55 62 60 24 2
# filters for individual lakes:
f_lake <- lapply(unique(substr(samples, 1, 3)),
function(x) grepl(x, qc.files1))
# just to check
stopifnot(all.equal(as.numeric(table(substr(samples, 1, 3))),
sapply(f_lake, sum)/2))
names(f_lake) <- unique(substr(samples, 1, 3))
# And filters for forward and reverse reads:
f_R1 <- grepl('_R1_', qc.files1)
f_R2 <- grepl('_R2_', qc.files1)
# I create 4 lists of collections for each lake:
coll_R1.run1 <- lapply(unique(substr(samples, 1, 3)),
function(x) {
suppressMessages(qc_read_collection(
files = qc.files1[f_lake[[x]] & f_R1],
sample_names = grep(x, samples, value = TRUE),
modules = 'all',
verbose = FALSE
))
})
names(coll_R1.run1) <- unique(substr(samples, 1, 3))
coll_R2.run1 <- lapply(unique(substr(samples, 1, 3)),
function(x) {
suppressMessages(qc_read_collection(
files = qc.files1[f_lake[[x]] & f_R2],
sample_names = grep(x, samples, value = TRUE),
modules = 'all',
verbose = FALSE
))
})
names(coll_R2.run1) <- unique(substr(samples, 1, 3))
coll_R1.run2 <- lapply(unique(substr(samples, 1, 3)),
function(x) {
suppressMessages(qc_read_collection(
files = qc.files2[f_lake[[x]] & f_R1],
sample_names = grep(x, samples, value = TRUE),
modules = 'all',
verbose = FALSE
))
})
names(coll_R1.run2) <- unique(substr(samples, 1, 3))
coll_R2.run2 <- lapply(unique(substr(samples, 1, 3)),
function(x) {
suppressMessages(qc_read_collection(
files = qc.files2[f_lake[[x]] & f_R2],
sample_names = grep(x, samples, value = TRUE),
modules = 'all',
verbose = FALSE
))
})
names(coll_R2.run2) <- unique(substr(samples, 1, 3))
In order to reduce the size of the report and to be able to compare forward and reverse reads as well as the two sequencing runs, I will group plots by lake. Not all modules produce meaningful plots when the collection includes several reports. For example, the module "Per base sequence content" would try to produce as many plots as reports, instead of summarizing them in a single plot.
library('gridExtra')
library('ggplot2')
for (lake in unique(substr(samples, 1, 3))) {
whatPlot <- 'Per base sequence quality'
p1 <- qc_plot_collection(coll_R1.run1[[lake]], modules = whatPlot)
p2 <- qc_plot_collection(coll_R2.run1[[lake]], modules = whatPlot)
p3 <- qc_plot_collection(coll_R1.run2[[lake]], modules = whatPlot)
p4 <- qc_plot_collection(coll_R2.run2[[lake]], modules = whatPlot)
p1 <- p1 + ggtitle(lake, subtitle = 'Forward reads, sequencing run 1')
p2 <- p2 + ggtitle(lake, subtitle = 'Reverse reads, sequencing run 1')
p3 <- p3 + ggtitle(lake, subtitle = 'Forward reads, sequencing run 2')
p4 <- p4 + ggtitle(lake, subtitle = 'Reverse reads, sequencing run 2')
if (sum(f_lake[[lake]]) > 44) {
p1 <- p1 + guides(color = 'none')
p2 <- p2 + guides(color = 'none')
p3 <- p3 + guides(color = 'none')
p4 <- p4 + guides(color = 'none')
}
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
}
## Warning: Removed 76 row(s) containing missing values (geom_path).
## Warning: Removed 76 row(s) containing missing values (geom_path).
## Warning: Removed 38 row(s) containing missing values (geom_path).
## Warning: Removed 38 row(s) containing missing values (geom_path).
for (lake in unique(substr(samples, 1, 3))) {
whatPlot <- 'Per sequence quality scores'
p1 <- qc_plot_collection(coll_R1.run1[[lake]], modules = whatPlot)
p2 <- qc_plot_collection(coll_R2.run1[[lake]], modules = whatPlot)
p3 <- qc_plot_collection(coll_R1.run2[[lake]], modules = whatPlot)
p4 <- qc_plot_collection(coll_R2.run2[[lake]], modules = whatPlot)
p1 <- p1 + ggtitle(lake, subtitle = 'Forward reads, sequencing run 1')
p2 <- p2 + ggtitle(lake, subtitle = 'Reverse reads, sequencing run 1')
p3 <- p3 + ggtitle(lake, subtitle = 'Forward reads, sequencing run 2')
p4 <- p4 + ggtitle(lake, subtitle = 'Reverse reads, sequencing run 2')
if (sum(f_lake[[lake]]) > 44) {
p1 <- p1 + guides(color = 'none')
p2 <- p2 + guides(color = 'none')
p3 <- p3 + guides(color = 'none')
p4 <- p4 + guides(color = 'none')
}
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
}
for (lake in unique(substr(samples, 1, 3))) {
whatPlot <- 'Per sequence GC content'
p1 <- qc_plot_collection(coll_R1.run1[[lake]], modules = whatPlot)
p2 <- qc_plot_collection(coll_R2.run1[[lake]], modules = whatPlot)
p3 <- qc_plot_collection(coll_R1.run2[[lake]], modules = whatPlot)
p4 <- qc_plot_collection(coll_R2.run2[[lake]], modules = whatPlot)
p1 <- p1 + ggtitle(lake, subtitle = 'Forward reads, sequencing run 1')
p2 <- p2 + ggtitle(lake, subtitle = 'Reverse reads, sequencing run 1')
p3 <- p3 + ggtitle(lake, subtitle = 'Forward reads, sequencing run 2')
p4 <- p4 + ggtitle(lake, subtitle = 'Reverse reads, sequencing run 2')
if (sum(f_lake[[lake]]) > 44) {
p1 <- p1 + guides(color = 'none')
p2 <- p2 + guides(color = 'none')
p3 <- p3 + guides(color = 'none')
p4 <- p4 + guides(color = 'none')
}
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
}
for (lake in unique(substr(samples, 1, 3))) {
whatPlot <- 'Per base N content'
p1 <- qc_plot_collection(coll_R1.run1[[lake]], modules = whatPlot)
p2 <- qc_plot_collection(coll_R2.run1[[lake]], modules = whatPlot)
p3 <- qc_plot_collection(coll_R1.run2[[lake]], modules = whatPlot)
p4 <- qc_plot_collection(coll_R2.run2[[lake]], modules = whatPlot)
p1 <- p1 + ggtitle(lake, subtitle = 'Forward reads, sequencing run 1')
p2 <- p2 + ggtitle(lake, subtitle = 'Reverse reads, sequencing run 1')
p3 <- p3 + ggtitle(lake, subtitle = 'Forward reads, sequencing run 2')
p4 <- p4 + ggtitle(lake, subtitle = 'Reverse reads, sequencing run 2')
if (sum(f_lake[[lake]]) > 44) {
p1 <- p1 + guides(color = 'none')
p2 <- p2 + guides(color = 'none')
p3 <- p3 + guides(color = 'none')
p4 <- p4 + guides(color = 'none')
}
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
}
We have a lot of data. The quality is good. I don't know why the sequencing center repeated the sequencing run. We can be conservative when filtering.
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=ca_ES.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_ES.UTF-8 LC_COLLATE=ca_ES.UTF-8
## [5] LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=ca_ES.UTF-8
## [7] LC_PAPER=es_ES.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.3.5 gridExtra_2.3 fastqcr_0.1.2
##
## loaded via a namespace (and not attached):
## [1] highr_0.9 pillar_1.6.4 bslib_0.3.1 compiler_3.6.3
## [5] jquerylib_0.1.4 tools_3.6.3 bit_4.0.4 digest_0.6.29
## [9] jsonlite_1.7.2 evaluate_0.14 lifecycle_1.0.1 tibble_3.1.6
## [13] gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.12 cli_3.1.0
## [17] DBI_1.1.1 parallel_3.6.3 yaml_2.2.1 xfun_0.28
## [21] fastmap_1.1.0 withr_2.4.3 dplyr_1.0.7 stringr_1.4.0
## [25] knitr_1.36 hms_1.1.1 generics_0.1.1 vctrs_0.3.8
## [29] sass_0.4.0 bit64_4.0.5 grid_3.6.3 tidyselect_1.1.1
## [33] glue_1.5.1 R6_2.5.1 fansi_0.5.0 vroom_1.5.7
## [37] rmarkdown_2.11 farver_2.1.0 tzdb_0.2.0 readr_2.1.1
## [41] purrr_0.3.4 magrittr_2.0.1 scales_1.1.1 ellipsis_0.3.2
## [45] htmltools_0.5.2 assertthat_0.2.1 colorspace_2.0-2 labeling_0.4.2
## [49] utf8_1.2.2 stringi_1.7.6 munsell_0.5.0 crayon_1.4.2